OUAHMANE TARIQ

1: UNDESTAND THE PROBLEM STATEMENT AND BUSINESS CASE

alt text

2: IMPORT LIBRARIES/DATASETS AND PERFORM EXPLORATORY DATA ANALYSIS

In [7]:
# import key libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import plotly.express as px

# Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
In [8]:
# load the stock news data
stock_df = pd.read_csv("stock_sentiment.csv")
In [9]:
# Let's view the dataset 
stock_df
Out[9]:
Text Sentiment
0 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... 1
1 user: AAP MOVIE. 55% return for the FEA/GEED i... 1
2 user I'd be afraid to short AMZN - they are lo... 1
3 MNTA Over 12.00 1
4 OI Over 21.37 1
... ... ...
5786 Industry body CII said #discoms are likely to ... 0
5787 #Gold prices slip below Rs 46,000 as #investor... 0
5788 Workers at Bajaj Auto have agreed to a 10% wag... 1
5789 #Sharemarket LIVE: Sensex off day’s high, up 6... 1
5790 #Sensex, #Nifty climb off day's highs, still u... 1

5791 rows × 2 columns

In [10]:
# dataframe information
stock_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB
In [11]:
# check for null values
stock_df.isnull().sum()
Out[11]:
Text         0
Sentiment    0
dtype: int64

3: PERFORM DATA CLEANING (REMOVE PUNCTUATIONS FROM TEXT)

In [12]:
import string
string.punctuation
Out[12]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [13]:
Test = '$I love AI & Machine learning!!'
Test_punc_removed = [char for char in Test if char not in string.punctuation]
Test_punc_removed_join = ''.join(Test_punc_removed)
Test_punc_removed_join
Out[13]:
'I love AI  Machine learning'
In [14]:
# Let's define a function to remove punctuations
def remove_punc(message):
    Test_punc_removed = [char for char in message if char not in string.punctuation]
    Test_punc_removed_join = ''.join(Test_punc_removed)

    return Test_punc_removed_join
In [15]:
# Let's remove punctuations from our dataset 
stock_df['Text Without Punctuation'] = stock_df['Text'].apply(remove_punc)
In [16]:
stock_df
Out[16]:
Text Sentiment Text Without Punctuation
0 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... 1 Kickers on my watchlist XIDE TIT SOQ PNK CPW B...
1 user: AAP MOVIE. 55% return for the FEA/GEED i... 1 user AAP MOVIE 55 return for the FEAGEED indic...
2 user I'd be afraid to short AMZN - they are lo... 1 user Id be afraid to short AMZN they are look...
3 MNTA Over 12.00 1 MNTA Over 1200
4 OI Over 21.37 1 OI Over 2137
... ... ... ...
5786 Industry body CII said #discoms are likely to ... 0 Industry body CII said discoms are likely to s...
5787 #Gold prices slip below Rs 46,000 as #investor... 0 Gold prices slip below Rs 46000 as investors b...
5788 Workers at Bajaj Auto have agreed to a 10% wag... 1 Workers at Bajaj Auto have agreed to a 10 wage...
5789 #Sharemarket LIVE: Sensex off day’s high, up 6... 1 Sharemarket LIVE Sensex off day’s high up 600 ...
5790 #Sensex, #Nifty climb off day's highs, still u... 1 Sensex Nifty climb off days highs still up 2 K...

5791 rows × 3 columns

In [17]:
stock_df['Text'][2]
Out[17]:
"user I'd be afraid to short AMZN - they are looking like a near-monopoly in eBooks and infrastructure-as-a-service"
In [18]:
stock_df['Text Without Punctuation'][2]
Out[18]:
'user Id be afraid to short AMZN  they are looking like a nearmonopoly in eBooks and infrastructureasaservice'

4: PERFORM DATA CLEANING (REMOVE STOPWORDS)

In [19]:
# download stopwords
nltk.download("stopwords")
stopwords.words('english')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GGG\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
Out[19]:
['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 'not',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'don',
 "don't",
 'should',
 "should've",
 'now',
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]
In [20]:
# Obtain additional stopwords from nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','will','aap','co','day','user','stock','today','week','year'])
In [21]:
# Remove stopwords and remove short words (less than 2 characters)
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text): #put all the words compose the text in a list
        if len(token) >= 3 and token not in stop_words:
            result.append(token)
            
    return result
In [22]:
# apply pre-processing to the text column
stock_df['Text Without Punc & Stopwords'] = stock_df['Text Without Punctuation'].apply(preprocess)
In [23]:
stock_df['Text'][0]
Out[23]:
'Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ  trade method 1 or method 2, see prev posts'
In [24]:
stock_df['Text Without Punc & Stopwords'][0]
Out[24]:
['kickers',
 'watchlist',
 'xide',
 'tit',
 'soq',
 'pnk',
 'cpw',
 'bpz',
 'trade',
 'method',
 'method',
 'see',
 'prev',
 'posts']
In [25]:
stock_df
Out[25]:
Text Sentiment Text Without Punctuation Text Without Punc & Stopwords
0 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... 1 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... [kickers, watchlist, xide, tit, soq, pnk, cpw,...
1 user: AAP MOVIE. 55% return for the FEA/GEED i... 1 user AAP MOVIE 55 return for the FEAGEED indic... [movie, return, feageed, indicator, trades, aw...
2 user I'd be afraid to short AMZN - they are lo... 1 user Id be afraid to short AMZN they are look... [afraid, short, amzn, looking, like, nearmonop...
3 MNTA Over 12.00 1 MNTA Over 1200 [mnta]
4 OI Over 21.37 1 OI Over 2137 []
... ... ... ... ...
5786 Industry body CII said #discoms are likely to ... 0 Industry body CII said discoms are likely to s... [industry, body, cii, said, discoms, likely, s...
5787 #Gold prices slip below Rs 46,000 as #investor... 0 Gold prices slip below Rs 46000 as investors b... [gold, prices, slip, investors, book, profits,...
5788 Workers at Bajaj Auto have agreed to a 10% wag... 1 Workers at Bajaj Auto have agreed to a 10 wage... [workers, bajaj, auto, agreed, wage, cut, peri...
5789 #Sharemarket LIVE: Sensex off day’s high, up 6... 1 Sharemarket LIVE Sensex off day’s high up 600 ... [sharemarket, live, sensex, high, points, nift...
5790 #Sensex, #Nifty climb off day's highs, still u... 1 Sensex Nifty climb off days highs still up 2 K... [sensex, nifty, climb, days, highs, still, key...

5791 rows × 4 columns

5: PLOT WORDCLOUD

In [26]:
# join the words into a string
stock_df['Text Without Punc & Stopwords Joined'] = stock_df['Text Without Punc & Stopwords'].apply(lambda x: " ".join(x))
In [27]:
# plot the word cloud for text with positive sentiment
plt.figure(figsize = (20, 20)) 
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(" ".join(stock_df[stock_df['Sentiment'] == 1]['Text Without Punc & Stopwords Joined']))
plt.imshow(wc, interpolation = 'bilinear');

6: VISUALIZE CLEANED DATASETS

In [28]:
stock_df
Out[28]:
Text Sentiment Text Without Punctuation Text Without Punc & Stopwords Text Without Punc & Stopwords Joined
0 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... 1 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... [kickers, watchlist, xide, tit, soq, pnk, cpw,... kickers watchlist xide tit soq pnk cpw bpz tra...
1 user: AAP MOVIE. 55% return for the FEA/GEED i... 1 user AAP MOVIE 55 return for the FEAGEED indic... [movie, return, feageed, indicator, trades, aw... movie return feageed indicator trades awesome
2 user I'd be afraid to short AMZN - they are lo... 1 user Id be afraid to short AMZN they are look... [afraid, short, amzn, looking, like, nearmonop... afraid short amzn looking like nearmonopoly eb...
3 MNTA Over 12.00 1 MNTA Over 1200 [mnta] mnta
4 OI Over 21.37 1 OI Over 2137 []
... ... ... ... ... ...
5786 Industry body CII said #discoms are likely to ... 0 Industry body CII said discoms are likely to s... [industry, body, cii, said, discoms, likely, s... industry body cii said discoms likely suffer n...
5787 #Gold prices slip below Rs 46,000 as #investor... 0 Gold prices slip below Rs 46000 as investors b... [gold, prices, slip, investors, book, profits,... gold prices slip investors book profits amid c...
5788 Workers at Bajaj Auto have agreed to a 10% wag... 1 Workers at Bajaj Auto have agreed to a 10 wage... [workers, bajaj, auto, agreed, wage, cut, peri... workers bajaj auto agreed wage cut period apri...
5789 #Sharemarket LIVE: Sensex off day’s high, up 6... 1 Sharemarket LIVE Sensex off day’s high up 600 ... [sharemarket, live, sensex, high, points, nift... sharemarket live sensex high points nifty test...
5790 #Sensex, #Nifty climb off day's highs, still u... 1 Sensex Nifty climb off days highs still up 2 K... [sensex, nifty, climb, days, highs, still, key... sensex nifty climb days highs still key factor...

5791 rows × 5 columns

In [29]:
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GGG\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
Out[29]:
True
In [30]:
# word_tokenize is used to break up a string into words
print(stock_df['Text Without Punc & Stopwords Joined'][0])
print(nltk.word_tokenize(stock_df['Text Without Punc & Stopwords Joined'][0]))
kickers watchlist xide tit soq pnk cpw bpz trade method method see prev posts
['kickers', 'watchlist', 'xide', 'tit', 'soq', 'pnk', 'cpw', 'bpz', 'trade', 'method', 'method', 'see', 'prev', 'posts']
In [31]:
# Obtain the maximum length of data in the document
# This will be later used when word embeddings are generated
maxlen = -1
for doc in stock_df['Text Without Punc & Stopwords Joined']:
    tokens = nltk.word_tokenize(doc)
    if(maxlen < len(tokens)):
        maxlen = len(tokens)
print("The maximum number of words in any document is:", maxlen)
The maximum number of words in any document is: 20
In [32]:
tweets_length = [ len(nltk.word_tokenize(x)) for x in stock_df['Text Without Punc & Stopwords Joined'] ]
tweets_length
Out[32]:
[14,
 6,
 7,
 1,
 0,
 1,
 8,
 13,
 8,
 4,
 10,
 18,
 8,
 8,
 11,
 5,
 12,
 10,
 12,
 4,
 6,
 5,
 2,
 5,
 3,
 10,
 3,
 3,
 9,
 6,
 8,
 10,
 8,
 3,
 10,
 11,
 4,
 8,
 11,
 10,
 10,
 7,
 9,
 8,
 4,
 10,
 8,
 7,
 7,
 9,
 10,
 10,
 8,
 2,
 14,
 12,
 9,
 8,
 1,
 14,
 7,
 11,
 5,
 13,
 6,
 6,
 7,
 6,
 6,
 10,
 9,
 4,
 10,
 13,
 16,
 6,
 9,
 6,
 2,
 11,
 5,
 4,
 9,
 11,
 16,
 4,
 9,
 5,
 6,
 2,
 1,
 5,
 9,
 4,
 7,
 6,
 7,
 10,
 1,
 3,
 9,
 4,
 16,
 8,
 10,
 11,
 18,
 6,
 7,
 13,
 12,
 9,
 3,
 3,
 10,
 10,
 8,
 6,
 8,
 5,
 5,
 8,
 9,
 15,
 3,
 5,
 9,
 18,
 13,
 11,
 3,
 1,
 1,
 2,
 7,
 12,
 12,
 8,
 8,
 10,
 9,
 10,
 15,
 8,
 9,
 9,
 12,
 10,
 6,
 10,
 13,
 4,
 11,
 8,
 15,
 13,
 8,
 12,
 5,
 5,
 3,
 7,
 2,
 4,
 10,
 5,
 6,
 10,
 6,
 16,
 9,
 6,
 5,
 10,
 9,
 10,
 8,
 2,
 10,
 9,
 10,
 12,
 7,
 10,
 7,
 3,
 6,
 5,
 9,
 9,
 4,
 6,
 5,
 4,
 4,
 8,
 10,
 8,
 8,
 13,
 9,
 7,
 10,
 12,
 8,
 6,
 5,
 3,
 10,
 11,
 6,
 9,
 10,
 13,
 10,
 7,
 12,
 7,
 11,
 10,
 4,
 4,
 3,
 10,
 2,
 9,
 10,
 15,
 12,
 10,
 13,
 9,
 8,
 2,
 1,
 12,
 12,
 6,
 9,
 9,
 12,
 5,
 2,
 3,
 8,
 6,
 14,
 5,
 6,
 7,
 9,
 1,
 1,
 11,
 7,
 14,
 4,
 4,
 9,
 5,
 4,
 9,
 9,
 10,
 13,
 5,
 11,
 4,
 4,
 6,
 1,
 8,
 4,
 11,
 4,
 6,
 11,
 4,
 10,
 8,
 8,
 6,
 10,
 3,
 10,
 10,
 2,
 9,
 10,
 13,
 9,
 10,
 3,
 2,
 8,
 4,
 6,
 14,
 5,
 4,
 1,
 6,
 11,
 11,
 9,
 5,
 7,
 7,
 11,
 13,
 9,
 5,
 6,
 10,
 5,
 11,
 8,
 17,
 14,
 10,
 3,
 3,
 5,
 6,
 3,
 5,
 4,
 5,
 13,
 13,
 5,
 9,
 10,
 6,
 5,
 4,
 10,
 8,
 11,
 6,
 9,
 9,
 6,
 8,
 3,
 7,
 3,
 3,
 10,
 8,
 4,
 5,
 12,
 10,
 2,
 10,
 2,
 1,
 12,
 13,
 5,
 6,
 3,
 13,
 15,
 5,
 11,
 10,
 4,
 14,
 11,
 4,
 6,
 11,
 8,
 6,
 7,
 12,
 4,
 12,
 4,
 2,
 5,
 14,
 15,
 13,
 10,
 15,
 4,
 10,
 5,
 11,
 1,
 6,
 4,
 6,
 8,
 4,
 13,
 7,
 10,
 13,
 8,
 10,
 8,
 8,
 2,
 10,
 10,
 12,
 11,
 1,
 7,
 9,
 13,
 10,
 12,
 3,
 3,
 4,
 10,
 7,
 4,
 9,
 2,
 7,
 11,
 5,
 11,
 3,
 14,
 7,
 7,
 13,
 4,
 11,
 5,
 6,
 7,
 11,
 5,
 9,
 9,
 8,
 9,
 9,
 6,
 7,
 4,
 11,
 10,
 9,
 2,
 2,
 4,
 9,
 4,
 12,
 12,
 1,
 9,
 12,
 4,
 9,
 4,
 6,
 10,
 13,
 1,
 0,
 1,
 2,
 13,
 3,
 8,
 9,
 6,
 6,
 4,
 6,
 9,
 6,
 5,
 3,
 5,
 12,
 6,
 3,
 14,
 11,
 15,
 2,
 4,
 14,
 5,
 6,
 6,
 6,
 13,
 5,
 9,
 9,
 10,
 3,
 7,
 12,
 4,
 3,
 10,
 2,
 3,
 2,
 9,
 3,
 3,
 4,
 9,
 3,
 7,
 4,
 5,
 14,
 8,
 5,
 4,
 6,
 5,
 10,
 5,
 11,
 4,
 4,
 7,
 5,
 4,
 12,
 4,
 2,
 12,
 4,
 12,
 13,
 5,
 7,
 4,
 10,
 5,
 8,
 6,
 5,
 12,
 11,
 11,
 13,
 4,
 2,
 12,
 8,
 4,
 1,
 2,
 4,
 14,
 7,
 9,
 13,
 5,
 8,
 10,
 5,
 10,
 5,
 6,
 9,
 9,
 10,
 5,
 10,
 13,
 2,
 9,
 6,
 11,
 5,
 8,
 8,
 12,
 8,
 7,
 2,
 6,
 6,
 7,
 2,
 11,
 10,
 9,
 3,
 11,
 4,
 8,
 5,
 11,
 10,
 4,
 3,
 10,
 6,
 11,
 2,
 5,
 5,
 6,
 9,
 15,
 11,
 7,
 8,
 11,
 5,
 15,
 3,
 10,
 9,
 6,
 4,
 8,
 8,
 3,
 2,
 12,
 7,
 4,
 7,
 10,
 1,
 1,
 4,
 11,
 16,
 12,
 5,
 13,
 8,
 4,
 7,
 14,
 12,
 9,
 2,
 4,
 9,
 10,
 6,
 11,
 12,
 5,
 6,
 2,
 9,
 9,
 12,
 1,
 2,
 10,
 5,
 9,
 2,
 2,
 8,
 6,
 11,
 5,
 6,
 9,
 5,
 11,
 3,
 13,
 5,
 6,
 5,
 12,
 9,
 2,
 7,
 11,
 7,
 5,
 9,
 10,
 3,
 8,
 9,
 10,
 11,
 11,
 5,
 12,
 5,
 4,
 11,
 11,
 13,
 12,
 1,
 8,
 7,
 12,
 3,
 7,
 12,
 1,
 3,
 7,
 7,
 5,
 5,
 6,
 7,
 3,
 5,
 13,
 8,
 12,
 9,
 1,
 4,
 11,
 7,
 5,
 7,
 16,
 9,
 8,
 13,
 13,
 14,
 7,
 2,
 10,
 10,
 6,
 13,
 1,
 12,
 4,
 4,
 4,
 5,
 8,
 3,
 5,
 13,
 3,
 10,
 3,
 13,
 7,
 8,
 4,
 1,
 8,
 7,
 6,
 12,
 12,
 5,
 6,
 6,
 10,
 11,
 10,
 7,
 3,
 10,
 11,
 11,
 15,
 12,
 12,
 7,
 6,
 12,
 15,
 9,
 9,
 10,
 2,
 7,
 6,
 7,
 11,
 10,
 9,
 4,
 5,
 5,
 8,
 7,
 10,
 1,
 7,
 6,
 4,
 3,
 9,
 8,
 10,
 7,
 6,
 6,
 10,
 8,
 5,
 7,
 7,
 6,
 8,
 4,
 7,
 9,
 9,
 7,
 4,
 10,
 9,
 4,
 1,
 3,
 4,
 1,
 13,
 8,
 4,
 7,
 4,
 9,
 12,
 7,
 4,
 9,
 3,
 1,
 3,
 10,
 11,
 3,
 7,
 4,
 12,
 4,
 5,
 7,
 12,
 7,
 13,
 4,
 12,
 2,
 8,
 4,
 7,
 5,
 6,
 13,
 14,
 8,
 9,
 3,
 1,
 9,
 0,
 9,
 7,
 11,
 5,
 14,
 3,
 6,
 8,
 8,
 11,
 3,
 5,
 3,
 2,
 2,
 2,
 8,
 4,
 5,
 5,
 4,
 9,
 3,
 3,
 6,
 6,
 5,
 15,
 8,
 1,
 10,
 11,
 8,
 7,
 7,
 6,
 5,
 5,
 5,
 8,
 11,
 6,
 10,
 7,
 4,
 10,
 1,
 6,
 6,
 7,
 12,
 4,
 4,
 7,
 9,
 6,
 12,
 15,
 9,
 8,
 10,
 7,
 8,
 5,
 11,
 10,
 6,
 10,
 6,
 8,
 7,
 8,
 4,
 9,
 7,
 12,
 10,
 14,
 8,
 3,
 5,
 0,
 13,
 14,
 11,
 7,
 7,
 2,
 6,
 7,
 9,
 7,
 5,
 10,
 13,
 9,
 3,
 2,
 11,
 12,
 4,
 4,
 7,
 14,
 7,
 14,
 7,
 3,
 7,
 8,
 17,
 9,
 14,
 11,
 10,
 6,
 10,
 2,
 15,
 10,
 4,
 3,
 4,
 6,
 10,
 10,
 11,
 ...]
In [33]:
# Plot the distribution for the number of words in a text
fig = px.histogram(x = tweets_length, nbins = 50)
fig.show()

7: PREPARE THE DATA BY TOKENIZING AND PADDING

alt text

In [34]:
stock_df
Out[34]:
Text Sentiment Text Without Punctuation Text Without Punc & Stopwords Text Without Punc & Stopwords Joined
0 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... 1 Kickers on my watchlist XIDE TIT SOQ PNK CPW B... [kickers, watchlist, xide, tit, soq, pnk, cpw,... kickers watchlist xide tit soq pnk cpw bpz tra...
1 user: AAP MOVIE. 55% return for the FEA/GEED i... 1 user AAP MOVIE 55 return for the FEAGEED indic... [movie, return, feageed, indicator, trades, aw... movie return feageed indicator trades awesome
2 user I'd be afraid to short AMZN - they are lo... 1 user Id be afraid to short AMZN they are look... [afraid, short, amzn, looking, like, nearmonop... afraid short amzn looking like nearmonopoly eb...
3 MNTA Over 12.00 1 MNTA Over 1200 [mnta] mnta
4 OI Over 21.37 1 OI Over 2137 []
... ... ... ... ... ...
5786 Industry body CII said #discoms are likely to ... 0 Industry body CII said discoms are likely to s... [industry, body, cii, said, discoms, likely, s... industry body cii said discoms likely suffer n...
5787 #Gold prices slip below Rs 46,000 as #investor... 0 Gold prices slip below Rs 46000 as investors b... [gold, prices, slip, investors, book, profits,... gold prices slip investors book profits amid c...
5788 Workers at Bajaj Auto have agreed to a 10% wag... 1 Workers at Bajaj Auto have agreed to a 10 wage... [workers, bajaj, auto, agreed, wage, cut, peri... workers bajaj auto agreed wage cut period apri...
5789 #Sharemarket LIVE: Sensex off day’s high, up 6... 1 Sharemarket LIVE Sensex off day’s high up 600 ... [sharemarket, live, sensex, high, points, nift... sharemarket live sensex high points nifty test...
5790 #Sensex, #Nifty climb off day's highs, still u... 1 Sensex Nifty climb off days highs still up 2 K... [sensex, nifty, climb, days, highs, still, key... sensex nifty climb days highs still key factor...

5791 rows × 5 columns

In [35]:
# Obtain the total words present in the dataset
list_of_words = []
for i in stock_df['Text Without Punc & Stopwords']:
    for j in i:
        list_of_words.append(j)
In [36]:
# Obtain the total number of unique words
total_words = len(list(set(list_of_words)))
total_words
Out[36]:
9268
In [37]:
# split the data into test and train 
X = stock_df['Text Without Punc & Stopwords']
y = stock_df['Sentiment']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
In [38]:
X_train.shape
Out[38]:
(5211,)
In [39]:
X_test.shape
Out[39]:
(580,)
In [40]:
X_train
Out[40]:
1417    [gmc, gap, puts, weakest, floor, wouldnt, surp...
5625    [women, ujjwala, yojana, given, free, lpg, cyl...
4809    [position, boss, took, position, long, time, s...
4418                                       [time, batman]
1892       [znga, bearish, dont, know, poker, incredible]
                              ...                        
1207    [volume, support, levels, ish, broadening, top...
1755                                       [nailed, goog]
1944    [gmc, cst, report, bell, expecting, sfy, type,...
5291    [outbreak, coronavirus, swept, jpmorganâ, trad...
3081                           [tjx, stop, loss, beating]
Name: Text Without Punc & Stopwords, Length: 5211, dtype: object
In [41]:
# Create a tokenizer to tokenize the words and create sequences of tokenized words
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(X_train)

# Training data
train_sequences = tokenizer.texts_to_sequences(X_train)

# Testing data
test_sequences = tokenizer.texts_to_sequences(X_test)
In [42]:
print("The encoding for document\n", X_train[1],"\n is: ", train_sequences[1])
The encoding for document
 ['movie', 'return', 'feageed', 'indicator', 'trades', 'awesome'] 
 is:  [2461, 3632, 3633, 1123, 483, 2462, 2463, 16, 599, 261, 14, 359, 1282, 3634, 3635, 3636]
In [43]:
# Add padding to training and testing to make all samples with same size
padded_train = pad_sequences(train_sequences, maxlen = 29, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences, maxlen = 29, truncating = 'post')
In [44]:
#for examples of 3 samples we add 0 to have same size
for i, doc in enumerate(padded_train[:3]):
     print("The padded encoding for document:", i+1," is:", doc)
The padded encoding for document: 1  is: [ 231   53  108 3630 1281  855 1521   20 3631  409  320  125 1522  695
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0]
The padded encoding for document: 2  is: [2461 3632 3633 1123  483 2462 2463   16  599  261   14  359 1282 3634
 3635 3636    0    0    0    0    0    0    0    0    0    0    0    0
    0]
The padded encoding for document: 3  is: [  55 3637  251   55    4   28    1  232  220 2464 1124    1  600   81
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0]
In [45]:
# Convert the data to categorical 2D representation
y_train_cat = to_categorical(y_train, 2)
y_test_cat = to_categorical(y_test, 2)
In [46]:
y_train_cat.shape
Out[46]:
(5211, 2)
In [47]:
y_test_cat.shape
Out[47]:
(580, 2)
In [48]:
y_train_cat
Out[48]:
array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)

8: UNDERSTAND THE THEORY AND INTUITION BEHIND RECURRENT NEURAL NETWORKS AND LONG SHORT TERM MEMORY NETWORKS (LSTM)

alt text

alt text

alt text

9: BUILD A CUSTOM-BASED DEEP NEURAL NETWORK TO PERFORM SENTIMENT ANALYSIS

alt text

In [49]:
# Sequential Model
model = Sequential()

# embedding layer
model.add(Embedding(total_words, output_dim = 512))

# Bi-Directional RNN and LSTM
model.add(LSTM(256))

# Dense layers
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(2,activation = 'softmax'))
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['acc'])
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, None, 512)         4745216   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               787456    
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
=================================================================
Total params: 5,565,826
Trainable params: 5,565,826
Non-trainable params: 0
_________________________________________________________________
In [ ]:
# train the model
model.fit(padded_train, y_train_cat, batch_size = 32, validation_split = 0.2, epochs = 2)
Epoch 1/2
 13/131 [=>............................] - ETA: 1:03 - loss: 0.6729 - acc: 0.5541

10: ASSESS TRAINED MODEL PERFORMANCE

alt text

In [ ]:
# make prediction
pred = model.predict(padded_test)
In [ ]:
np.argmax?
In [ ]:
# make prediction
prediction = []
for i in pred:
  prediction.append(np.argmax(i))
In [ ]:
# list containing original values
original = []
for i in y_test_cat:
  original.append(np.argmax(i))
In [ ]:
# acuracy score on text data
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(original, prediction)
accuracy
In [ ]:
# Plot the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(original, prediction)
sns.heatmap(cm, annot = True)